<!DOCTYPE html>



  


<html class="theme-next muse use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.8.0">
  <meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<!-- Viewport: maximum-scale is intentionally omitted so users can pinch-zoom the page (WCAG 1.4.4 Resize Text). -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#222">









<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
















  
  
  <link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">







<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">

<link href="/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">


  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">





  <meta name="keywords" content="Hexo, NexT">





  <link rel="alternate" href="/atom.xml" title="renhao" type="application/atom+xml">






<meta name="description" content="毕业论文中的相关工作">
<meta property="og:type" content="article">
<meta property="og:title" content="related work for graduation thesis of undergraduate">
<meta property="og:url" content="https://huangrenhao.gitee.io/2018/12/20/thesis-undergraduate/index.html">
<meta property="og:site_name" content="renhao">
<meta property="og:description" content="毕业论文中的相关工作">
<meta property="og:locale" content="zh-Hans">
<meta property="og:image" content="https://weitinglindotcom.files.wordpress.com/2016/01/screenshot24.png?w=620">
<meta property="og:image" content="https://weitinglindotcom.files.wordpress.com/2016/01/screenshot25.png?w=620">
<meta property="og:image" content="https://weitinglindotcom.files.wordpress.com/2016/01/screenshot21.png">
<meta property="og:updated_time" content="2020-09-03T08:50:37.000Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="related work for graduation thesis of undergraduate">
<meta name="twitter:description" content="毕业论文中的相关工作">
<meta name="twitter:image" content="https://weitinglindotcom.files.wordpress.com/2016/01/screenshot24.png?w=620">



<script type="text/javascript" id="hexo.configurations">
  // Theme namespace: reuse window.NexT when it already exists, otherwise start from an empty object.
  var NexT = window.NexT || {};
  // Site-wide settings exposed as a global variable.
  // NOTE(review): presumably read by the NexT theme scripts loaded later; the consumers are not visible in this file.
  var CONFIG = {
    root: '/',
    scheme: 'Muse',
    version: '5.1.4',
    // Sidebar settings: placed on the left, "display" is set to "post".
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: true,
    tabs: true,
    // Motion is enabled; "transition" maps page parts to named effects.
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    duoshuo: {
      userId: '0',
      author: '博主'
    },
    // Algolia search settings: applicationID, apiKey and indexName are empty strings in this build.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      // ${query}, ${hits} and ${time} sit inside ordinary double-quoted strings (not template literals),
      // so they are stored as literal placeholder text.
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>



  <link rel="canonical" href="https://huangrenhao.gitee.io/2018/12/20/thesis-undergraduate/">





  <title>related work for graduation thesis of undergraduate | renhao</title>
  








<link rel="stylesheet" href="/css/prism-tomorrow.css" type="text/css"></head>

<body itemscope="" itemtype="http://schema.org/WebPage" lang="zh-Hans">
  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope="" itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">renhao</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <p class="site-subtitle"></p>
      
  </div>

  <div class="site-nav-toggle">
    <!-- Icon-only menu toggle: type="button" stops it from defaulting to a submit button,
         and aria-label gives it an accessible name, since the three bars carry no text. -->
    <button type="button" aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      
        
        <li class="menu-item menu-item-home">
          <a href="/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-home"></i> <br>
            
            首页
          </a>
        </li>
      
        
        <li class="menu-item menu-item-categories">
          <a href="/categories/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-th"></i> <br>
            
            分类
          </a>
        </li>
      
        
        <li class="menu-item menu-item-archives">
          <a href="/archives/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
            
            归档
          </a>
        </li>
      

      
    </ul>
  

  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope="" itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="https://huangrenhao.gitee.io/2018/12/20/thesis-undergraduate/">

    <span hidden itemprop="author" itemscope="" itemtype="http://schema.org/Person">
      <meta itemprop="name" content="renhao">
      <meta itemprop="description" content="">
      <meta itemprop="image" content="/images/avatar.gif">
    </span>

    <span hidden itemprop="publisher" itemscope="" itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="renhao">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">related work for graduation thesis of undergraduate</h1>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2018-12-20T09:50:56+08:00">
                2018-12-20
              </time>
            

            

            
          </span>

          
            <span class="post-category">
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope="" itemtype="http://schema.org/Thing">
                  <a href="/categories/bioinformatic/" itemprop="url" rel="index">
                    <span itemprop="name">bioinformatic</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
              <span class="post-comments-count">
                <span class="post-meta-divider">|</span>
                <span class="post-meta-item-icon">
                  <i class="fa fa-comment-o"></i>
                </span>
                <a href="/2018/12/20/thesis-undergraduate/#comments" itemprop="discussionUrl">
                  <span class="post-comments-count valine-comment-count" data-xid="/2018/12/20/thesis-undergraduate/" itemprop="commentCount"></span>
                </a>
              </span>
            
          

          
          

          

          
            <div class="post-wordcount">
              
                
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                  <span class="post-meta-item-text">字数统计&#58;</span>
                
                <span title="字数统计">
                  7.2k
                </span>
              

              
                <span class="post-meta-divider">|</span>
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-clock-o"></i>
                </span>
                
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                
                <span title="阅读时长">
                  32
                </span>
              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <p>毕业论文中的相关工作<a id="more"></a></p>
<h2 id="IGV"><a href="#IGV" class="headerlink" title="IGV"></a>IGV</h2><h3 id="features"><a href="#features" class="headerlink" title="features"></a>features</h3><ul>
<li><p>integrative genomics viewer </p>
</li>
<li><p>desktop application and all major platforms(Windows,Linux and Mac)</p>
</li>
<li><p>high-performance visualization</p>
</li>
<li><p>support loading of local and remote data sets</p>
</li>
<li><p>igv handles large heterogenous data set</p>
</li>
<li><p>ability to view data in multiple genomic regions simultaneously in adjacent panels(相邻面板可以同时展示多个基因区域)</p>
</li>
</ul>
<h3 id="支持的输入数据格式："><a href="#支持的输入数据格式：" class="headerlink" title="支持的输入数据格式："></a>支持的输入数据格式：</h3><ul>
<li><p>序列对比(NGS data)：<code>sam</code>,<code>bam</code>,<code>Goby</code></p>
</li>
<li><p>基因注释：<code>gtf</code>,<code>gff2</code>,<code>bed</code>,<code>psl</code></p>
</li>
<li><p>显示覆盖率：<code>tdf</code></p>
</li>
<li><p>拷贝数：<code>snp</code>、<code>cn</code></p>
</li>
<li><p>基因表达： <code>gct</code>、<code>res</code></p>
</li>
<li><p>突变数据(variant calls)： <code>mut</code>,<code>vcf</code></p>
</li>
<li><p>追踪参考基因组覆盖率、测序深度（UCSC）：<code>wig</code>、<code>bw</code></p>
</li>
<li><p>或者可以分为下面三类</p>
<ul>
<li><p>nonindexed : <code>gff</code>,<code>bed</code>,<code>wig</code></p>
<p>(must be read in their entirety and only suitable for relatively small data sets)</p>
</li>
<li><p>indexed : <code>bam</code>,<code>Goby</code></p>
<p>efficiently support views only for a limited range of resolution scales</p>
</li>
<li><p>multiresolution : <code>tdf</code>,<code>bigWig</code>,<code>bigBed</code></p>
<p>include both an index for the raw data and precomputed indexed summary data for lower resolution(zoomed out) scales.–&gt;efficiently support views at any resolution scale</p>
</li>
</ul>
</li>
</ul>
<h3 id="igv对输入文件进行了什么处理（如何进行各种文件的可视化展示）"><a href="#igv对输入文件进行了什么处理（如何进行各种文件的可视化展示）" class="headerlink" title="igv对输入文件进行了什么处理（如何进行各种文件的可视化展示）?"></a>igv对输入文件进行了什么处理（如何进行各种文件的可视化展示）?</h3><ul>
<li><p>tiled data format(TDF)：(输入文件将被转换成这个二进制文件格式)</p>
<ul>
<li>feature : stores the pyramidal(金字塔形的) data tile structure and provides fast access to individual tiles</li>
<li>purpose : to support the multiresolution(多分辨率) data model / Used to preprocess large datasets for improved IGV performance.</li>
<li>conversion : using auxiliary(辅助设备) package ‘igvtools’</li>
</ul>
</li>
<li><p>NGS data : </p>
<ul>
<li><p>zoomed all the way out :  only a bar chart of the read coverage</p>
</li>
<li><p>zoomed in : the individual read alignments come into view and displayed as                        horizontal bars , dynamically computes the read coverage</p>
</li>
<li><p>zoomed further : reveal the individual read bases</p>
</li>
</ul>
</li>
<li><p>Variant calls</p>
<p>viewing variants stored in the VCF format.</p>
</li>
<li><p>Copy number and expression data </p>
<p>These data types are displayed as heatmaps(热图) by default.</p>
<ul>
<li><p>expression data : require special treatment as the expression values are usually not specified in genomic coordinates(坐标) but associated with gene names or chip probe identifiers.</p>
<p>special treatment : these data must be mapped to genomic locations prior(优先) to display.</p>
</li>
</ul>
<blockquote>
<p>热图（Heat Map）以表格的形式展现数据，使用自定义的颜色来标识高、中、低。</p>
</blockquote>
</li>
<li><p>Genomic annotations</p>
<p>Visual representation of annotations follows many of the conventions introduced by the UCSC Genome Browser. </p>
</li>
<li><p>Sample attributes</p>
<p>Tracks(不同的轨道，显示不同信息) can be annotated with metadata by loading a tab-delimited sample information file.</p>
<p>Metadata is displayed as a color-coded matrix in the attribute panel. Each column in the matrix corresponds to a specific attribute, and colors are used to distinguish different values of that attribute. Colors are assigned explicitly by the user or chosen automatically by IGV.</p>
</li>
</ul>
<h3 id="igv如何进行大文件可视化的高性能？"><a href="#igv如何进行大文件可视化的高性能？" class="headerlink" title="igv如何进行大文件可视化的高性能？"></a>igv如何进行大文件可视化的高性能？</h3><h4 id="对于文件规模过大的问题："><a href="#对于文件规模过大的问题：" class="headerlink" title="对于文件规模过大的问题："></a>对于文件规模过大的问题：</h4><ul>
<li><p>采用了一种预先计算在多个分辨率比例下的数据汇总，将渲染数据推迟到了执行时间。（‘data tiling’：we adopted a different approach that is based on <strong>precomputing summarizations of data at multiple resolution scales</strong>, with rendering of the data deferred to runtime.）</p>
</li>
<li><p>data tiling的实现方式：主要的大致思路就是将每个分辨率比例（也就是缩放程度）都分成一个tile，比如说第一个缩放比例下显示的就是一个展示整个基因组的tile，接下来的就是一个tile展示每个染色体chromosome（<em>为什么会有染色体的概念，像bam文件不就是一个一个的read吗？</em>），然后是两个tile为每个染色体，下一个就是4个tiles为每个染色体···，每个tile又细分为‘bins’，被选择bin的宽度与在该分辨率级别下由屏幕像素表示的近似基因组宽度相对应。每个bin的数值是由根本的基因组数据的总结出的统计特征。（For each resolution scale(‘zoom level’), the genome is divided into tiles that correspond to the region viewable on the screen of a typical user display. The first zoom level consists of a single tile that covers the entire genome. The next zoom level contains a single tile for each chromosome. The number of tiles then increases by a factor of 2 for each level, so the next zoom level consists of two tiles per chromosome, then four, etc. Each tile is subdivided into ‘bins’, with the width of a bin chosen to correspond to the approximate genomic width represented by a screen pixel at that resolution scale. The value of each bin is calculated from the underlying genomic data with a summary statistic, such as ‘mean’, ‘median’ or ‘maximum’.  ）</p>
</li>
<li><p>实际的问题以及采用的解决方法：对大的基因组来说需要的zoom levels越多，所需要的tiles的个数成指数型增长，造成很大的预计算开销。igv采用的解决方式是用一种混合的方式，就是预计算只是一些lower-level zoom levels，而将high-resolution tiles的计算放到运行时即时进行（on the fly）。</p>
</li>
</ul>
<h3 id="igv用到的工具（为了提升性能以及进行中间处理）、如何使用的？"><a href="#igv用到的工具（为了提升性能以及进行中间处理）、如何使用的？" class="headerlink" title="igv用到的工具（为了提升性能以及进行中间处理）、如何使用的？"></a>igv用到的工具（为了提升性能以及进行中间处理）、如何使用的？</h3><ul>
<li>igvtools</li>
</ul>
<h3 id="igv怎么实现支持本地数据和远程数据，如果是浏览器怎么实现实时的可视化并且加载大文件的性能不至于太差影响用户体验（exac）？"><a href="#igv怎么实现支持本地数据和远程数据，如果是浏览器怎么实现实时的可视化并且加载大文件的性能不至于太差影响用户体验（exac）？" class="headerlink" title="igv怎么实现支持本地数据和远程数据，如果是浏览器怎么实现实时的可视化并且加载大文件的性能不至于太差影响用户体验（exac）？"></a>igv怎么实现支持本地数据和远程数据，如果是浏览器怎么实现实时的可视化并且加载大文件的性能不至于太差影响用户体验（exac）？</h3><ul>
<li>本地数据：直接通过built-in file browser加载文件在本地文件系统上</li>
<li>远程数据：通过http或FTP协议下载文件然后加载、从服务器加载、entering the URL of a Distributed Annotation System (DAS) feature source</li>
</ul>
<h2 id="ExAC-Browser"><a href="#ExAC-Browser" class="headerlink" title="ExAC Browser"></a>ExAC Browser</h2><h3 id="ExAC"><a href="#ExAC" class="headerlink" title="ExAC :"></a>ExAC :</h3><p>Exome Aggregation Consortium(外显体资料库)</p>
<blockquote>
<p>high-quality exome : protein-coding region</p>
</blockquote>
<h4 id="function"><a href="#function" class="headerlink" title="function"></a>function</h4><p>contains an average of one variant every eight bases(碱基) of the exome, and provides direct evidence for the presence(存在) of widespread(普遍的) mutational(突变的) recurrence(复发). </p>
<ul>
<li>improves variant interpretation(解释) in rare disease</li>
<li>Inferring variant deleteriousness(有毒性) and gene constraint(约束)</li>
</ul>
<p>We have used this catalogue to calculate objective metrics(客观权值) of pathogenicity(致命性) for sequence variants, and to identify genes subject to strong selection against various classes of mutation(突变); identifying 3,230 genes with near-complete depletion of predicted protein-truncating variants, with 72% of these genes having no currently established human disease phenotype. Finally, we demonstrate(证明) that these data can be used for the efficient filtering of candidate(候补) disease-causing variants, and for the discovery of human ‘knockout’(淘汰) variants in protein-coding genes.（可以用来鉴定某个基因是否是致病基因等）</p>
<h3 id="ExAC-Browser-1"><a href="#ExAC-Browser-1" class="headerlink" title="ExAC Browser :"></a>ExAC Browser :</h3><p>将ExAC中的数据以基因Gene、转录transcript、变异Variant、多等位基因变异Multi-allelic variant、区域Region等这么多的字段以可视化的形式展示出来。可以通过相应生物名词的字段将其检索然后呈现出来对应的类型。</p>
<h4 id="可视化的机制是什么？"><a href="#可视化的机制是什么？" class="headerlink" title="可视化的机制是什么？"></a>可视化的机制是什么？</h4><p>可视化之中，除了显示变异位点信息的可视化部分有很强的interactive，别的部分基本上没什么，就是简单将数据以表格、柱形图的形式展示出来，加上鼠标hover事件。</p>
<p>显示变异位点信息部分：interactive visualization 借用了<code>IGV.js</code>来展示各个read部分，基于变异位点只展示变异位点在此染色体上的左右某个区间内的reads信息，而没有加载整个的染色体的reads部分！</p>
<blockquote>
<h5 id="IGV-js"><a href="#IGV-js" class="headerlink" title="IGV.js :"></a>IGV.js :</h5><p>Embeddable(可嵌入的) genomic visualization component based on the Integrative Genomics Viewer.</p>
<h6 id="Data-server-requirements"><a href="#Data-server-requirements" class="headerlink" title="Data server requirements"></a>Data server requirements</h6><p>servers hosting data(服务器托管数据) for use by igv.js must support HTTP Range headers(范围请求、部分请求) and Cross-Origin Resource Sharing(CORS)(跨域资源共享)，e.g. apache server and amazon s3 bucket support these requirements ,most popular servers can be configured to support these requirements.</p>
<ul>
<li><p>range request : </p>
<p>for indexed files,the data are retrieved on demand as needed for display.so data web server must support HTTP Range requests in order to allow retrieval of a subset of the data in the file.(这就是前面的只是展示变异位点左右某个区间内的reads信息，部分请求对大文件的数据可视化的展示必不可少。但是部分请求需要在服务器上托管的文件是添加了索引过的文件，所以<code>bam</code>、<code>fasta</code>需要索引文件，<code>bigwig</code>、<code>bigbed</code>包含了可嵌入的索引，对于注释格式像<code>bed</code>、<code>gtf</code>索引可要可不要)</p>
</li>
<li><p>CORS</p>
<p>跨域资源共享简单的说就是出于安全原因浏览器限制脚本内发起的跨源HTTP请求（就是请求的url中的域名、协议、或者端口与所在html页面的URL不相同），也有可能不是限制而是跨站请求可以发起但是返回结果被浏览器拦截，一定要实现跨源HTTP请求就必须使用CORS头文件。</p>
<p>所以igv.js使用CORS也是必要的。</p>
</li>
<li><p>bam mime type(bam文件类型)</p>
<p>In some cases it might be necessary to explicitly(准确) define a mime type for “.bam” files when using an Apache server.</p>
</li>
</ul>
</blockquote>
<h4 id="文件读取的方式是什么（或者说对文件进行了什么处理）？"><a href="#文件读取的方式是什么（或者说对文件进行了什么处理）？" class="headerlink" title="文件读取的方式是什么（或者说对文件进行了什么处理）？"></a>文件读取的方式是什么（或者说对文件进行了什么处理）？</h4><p>从igv.js的要求中可以看出文件的加载主要通过网络请求从服务器上加载数据，具体的细节部分没有看到。</p>
<h2 id="NGS流程"><a href="#NGS流程" class="headerlink" title="NGS流程"></a>NGS流程</h2><ul>
<li><p>首先经过测序机我们得到了<code>fastq.gz</code>文件，这里是多条DNA链进行复制然后打断形成很多的read片断，不去区分哪个read片断属于哪条链，我们需要的只是与reference进行比对分析，找出变异位点。这个点就涉及在可视化的方面展示的问题，很多的read是怎么排列起来的，一个参考序列的某个部分有很多个read在下面有序按起始先后排列（按先后顺序就是基于每个read的<code>alignment start</code>字段），这就是多条DNA链复制之后的结果，没有区分哪条链就是说排列在一行的并不是一条链的意思，而是由当前窗口大小决定。</p>
</li>
<li><p>经过<code>BWA</code>工具将<code>fastq</code>与<code>fasta</code>进行比对得到输出的比对结果<code>sam</code>文件，保存每个碱基对在参考序列上的位置是通过<code>alignment start</code>与<code>cigar</code>字段。</p>
</li>
<li><p>得到的<code>sam</code>文件经过<code>samtools compress</code>压缩成<code>bam</code>格式文件。</p>
</li>
<li><p><code>bam</code>文件经过<code>samtools sort</code>成<code>_sort.bam</code>文件，这个过程就是之前说的基于<code>alignment start</code>字段进行read的排列过程。</p>
</li>
<li><p>然后就经过<code>PICARD markDuplicate</code>进行去重，这个过程就是对read段进行判断，如果<code>alignment start</code>和<code>alignment end</code>完全相同，就认为这两个是重复的。</p>
</li>
<li><p>去重之后的<code>bam</code>文件再经过<code>GATK localRealignment</code>进行read部分碱基对的微调，就是有什么碱基对的缺失什么的可以进行重新排列。</p>
</li>
<li><p>然后再进行<code>GATK BQSR</code>进行read部分碱基对的quality参数的修正，因为出来的是测序机的不同通道，还有可能的就是位置上的差异导致的判断的误差，对此进行修正。</p>
</li>
<li><p>最后就是进行<code>GATK mutect2</code>等工具将之前一系列操作得到的<code>.bam</code>文件转化为表示变异位点信息的<code>.vcf</code>文件。</p>
</li>
</ul>
<h2 id="vcftools"><a href="#vcftools" class="headerlink" title="vcftools"></a>vcftools</h2><h3 id="VCF"><a href="#VCF" class="headerlink" title="VCF"></a>VCF</h3><p>variant call format(这么一个格式，不知道怎么翻译，只知道它是用来表示变异位点信息的)</p>
<p>能够表示多种基因变异，例如单核苷酸多态性(snps)，插入和删除(indels)，结构变异(structural variants)等等。</p>
<ul>
<li><p>header section</p>
<ul>
<li><p>arbitrary number(任意数量的) of metainformation(元信息) lines，each starting with character <code>##</code></p>
<p>providing a standardized description of tags and annotations used in the data section （提供在数据部分中标准化标志和注释描述）</p>
<p>consisting : </p>
<ul>
<li>means of file creation</li>
<li>date of creation</li>
<li>version of the reference sequence</li>
<li>software used</li>
<li>relevant to the history of the file </li>
</ul>
</li>
<li><p>TAB delimited field definition line(相当于表格中的表头行),starting with a single <code>#</code> character</p>
<p>names eight mandatory(强制的，必须的) columns , corresponding to data columns</p>
<p>they are :</p>
<ul>
<li>CHROM : the chromosome</li>
<li>POS : a 1-based position of the start of the variant</li>
<li>ID : unique identifiers(标志符) of the variant </li>
<li>REF : the reference allele</li>
<li>ALT : a comma(逗号) separated list of alternate(替换物) non-reference alleles</li>
<li>QUAL : a phred-scaled quality score(phred-scaled不知道什么东西的质量得分)</li>
<li>FILTER : site filtering(过滤) information</li>
<li>INFO : a semicolon(分号) separated list of additional,user extensible(可扩展) annotation</li>
</ul>
<p>optional : when samples are present in the file,the mandatory header columns are followed by </p>
<ul>
<li>FORMAT column : used to define the information contained within each subsequent(随后的) genotype column,which consists of a colon(冒号) separated list of fields(字段). </li>
<li>an arbitrary number of sample IDs that define the samples in the VCF file</li>
</ul>
</li>
</ul>
</li>
<li><p>data section </p>
</li>
</ul>
<h3 id="VCFtools"><a href="#VCFtools" class="headerlink" title="VCFtools"></a>VCFtools</h3><p>an open-source software package for parsing(语法分析、剖析), analyzing and manipulating(操作) VCF files.</p>
<p>可以按位点信息、碱基位置等信息筛选特定条件下的变异信息，也可以比较两个vcf文件之间变异信息的差异，也可以处理vcf文件，输出ped和plink格式文件等</p>
<h4 id="software-suite-软件集-：（split-into-two-modules"><a href="#software-suite-软件集-：（split-into-two-modules" class="headerlink" title="software suite(软件集)：（split into two modules)"></a>software suite(软件集)：（split into two modules)</h4><ul>
<li><p>The first module provides a general Perl API, and allows various operations to be performed on VCF files, including format validation(确认), merging, comparing, intersecting, making complements(互补品、其余产品) and basic overall(全体的) statistics. </p>
<p>主要是VCF文件操作</p>
</li>
<li><p>The second module consists of C++ executable(可执行文件) primarily used to analyze SNP data in VCF format, allowing the user to estimate allele frequencies, levels of linkage disequilibrium(连锁不平衡) and various Quality Control metrics(度量). </p>
<p>主要进行常规的分析</p>
</li>
</ul>
<h4 id="use-it"><a href="#use-it" class="headerlink" title="use it"></a>use it</h4><ul>
<li>before use : require that VCF files are compressed by bgzip and indexed by tabix. (both tools are part of the tabix package)</li>
<li>无论是Perl API 还是二进制可执行文件都是通过命令行的方式，在终端输入命令进行相应的操作。</li>
</ul>
<h2 id="consortium"><a href="#consortium" class="headerlink" title="consortium"></a>consortium</h2><h3 id="icgc"><a href="#icgc" class="headerlink" title="icgc"></a>icgc</h3><p>癌症基因组的数据集</p>
<p>可视化展示平台（<code>https://dcc.icgc.org/</code>）：包含癌症基因组数据集可视化、分析和下载，其中release部分大部分都包含这几类文件：</p>
<ul>
<li>Cancer Projects</li>
<li>Cancer primary sites</li>
<li>Donors(捐献者) with molecular(分子的) data in DCC</li>
<li>Total Donors</li>
<li>Simple Somatic Mutations(体细胞突变)</li>
<li>Mutated genes</li>
</ul>
<h3 id="impc"><a href="#impc" class="headerlink" title="impc"></a>impc</h3><p>knockout mouse strains(基因剔除小鼠品系)</p>
<p>对老鼠的表现型进行分析，统计学的分析和疾病关联，终端用户将可以通过<code>http://www.mousephenotype.org</code>来获取数据分布：统计关联性、疾病关联性、表型图片、自杀基因表达等</p>
<p>获取数据的方式就是通过命令行的方式使用各种API获取数据（就是加上在命令中加入各种参数之类的）</p>
<p>数据获取链接：<code>http://www.mousephenotype.org/data/documentation/data-access</code></p>
<h3 id="giab"><a href="#giab" class="headerlink" title="giab"></a>giab</h3><p>to develop the technical infrastructure(基础设施) (reference standards, reference methods, and reference data) to enable translation of whole human genome sequencing to clinical practice. </p>
<h3 id="ENCODE-Project-consortium"><a href="#ENCODE-Project-consortium" class="headerlink" title="ENCODE Project consortium"></a>ENCODE Project consortium</h3><p>encyclopedia(百科全书) of dna elements consortium</p>
<p>The goal of ENCODE is to build a comprehensive parts list of functional elements in the human genome, including elements that act at the protein and RNA levels, and regulatory(管理的) elements that control cells and circumstances(情况) in which a gene is active.</p>
<p>(在好多种数据集中)数据的获取：Data Coordination(协调) Center</p>
<h3 id="seven-bridges"><a href="#seven-bridges" class="headerlink" title="seven bridges"></a>seven bridges</h3><p>deliver end-to-end(端到端) bioinformatic solutions — including <strong>access to datasets</strong>, analytic workflows and algorithms, cloud-computing infrastructure(基础设施), and scientific support — that speed the path from raw experimental data to new treatments and diagnostics.</p>
<p>该平台托管大型基因组数据集以及查询，过滤和浏览它们的工具。</p>
<h4 id="seven-bridges-genome-browser"><a href="#seven-bridges-genome-browser" class="headerlink" title="seven bridges genome browser"></a>seven bridges genome browser</h4><ul>
<li>Visualize alignment files</li>
<li>Check coverage and mismatch proportion at base level</li>
<li>Assess alignments and variants for quality control</li>
<li>Visualize SNV/Indels and compare these to a supported reference</li>
<li>Visualize annotation tracks to interpret your data</li>
<li>Annotate your own data privately, and share comments on data with your collaborators</li>
</ul>
<p>data：bring and store your own files /use data from our public genomic reference files/easily query and use data from our large,publicly-hosted genomics dataset.</p>
<h2 id="CNV-detection"><a href="#CNV-detection" class="headerlink" title="CNV detection"></a>CNV detection</h2><h3 id="cnvs"><a href="#cnvs" class="headerlink" title="cnvs :"></a>cnvs :</h3><p>copy number variants 基因拷贝数变异</p>
<ul>
<li><p>a phenomenon in which sections of the genome are repeated and the number of repeats in the genome varies between individuals in the human population.部分基因重复并且基因的重复在人类群体中个体之间有一定的差异的现象</p>
</li>
<li><p>a type of <a href="https://en.wikipedia.org/wiki/Structural_variation" target="_blank" rel="noopener">structural variation</a>: specifically, it is a type of <a href="https://en.wikipedia.org/wiki/Gene_duplication" target="_blank" rel="noopener">duplication</a> or <a href="https://en.wikipedia.org/wiki/Deletion_(genetics)" target="_blank" rel="noopener">deletion</a> event that affects a considerable number of base pairs.它是一种结构变异，更具体一点是一种由复制或者删除事件影响到了相当多数目的碱基对。</p>
</li>
</ul>
<h3 id="CoNIFER"><a href="#CoNIFER" class="headerlink" title="CoNIFER"></a>CoNIFER</h3><p>copy number inference from exome reads </p>
<h4 id="功能："><a href="#功能：" class="headerlink" title="功能："></a>功能：</h4><p>uses exome sequencing data to find copy number variants (CNVs) and genotype the copy-number of duplicated genes.(使用外显子序列数据来找到基因拷贝数变异和把复制基因的拷贝数进行基因分型，在外显子数据中基于序列的测序深度对稀少的基于拷贝数变异探查)</p>
<p>可以从许多的实验的流量通过测量一批偏差来混合外显子序列(CoNIFER offers the ability to mix exome sequence from multiple experimental runs by eliminating batch biases.)</p>
<h4 id="使用："><a href="#使用：" class="headerlink" title="使用："></a>使用：</h4><p>输入文件是<code>bam</code>文件，但是在处理过程中要进过多次处理，包括文件转换（用到各种其他的包），也有中间文件格式。整体是一个基于python的程序，采用的方式就是下载之后安装然后以命令行的方式运行程序从而执行各项需要的功能。</p>
<h3 id="XHMM"><a href="#XHMM" class="headerlink" title="XHMM"></a>XHMM</h3><p>eXome-Hidden Markov Model(隐藏外显子组马尔科夫模型)</p>
<h4 id="功能：-1"><a href="#功能：-1" class="headerlink" title="功能："></a>功能：</h4><p>to detect and genotype copy number variation (CNV) from normalized read-depth data from targeted sequencing experiments（和CoNIFER的功能大致相同，也是探测基因拷贝数变异和将基因拷贝数进行基因分型）</p>
<p>uses principal component analysis (PCA)(主成分分析法) normalization(正常化) and a hidden Markov model (HMM) </p>
<h4 id="使用：-1"><a href="#使用：-1" class="headerlink" title="使用："></a>使用：</h4><p>首先得安装，除了自身的程序之外，还需要GATK和PLINK/Seq，文件需要Human reference genome files(人类参考基因组)和基因组bam文件。</p>
<h3 id="Codex"><a href="#Codex" class="headerlink" title="Codex"></a>Codex</h3><p>COpy number Detection by EXome sequencing </p>
<h4 id="功能"><a href="#功能" class="headerlink" title="功能"></a>功能</h4><p>a normalization and copy number variation detection method for whole exome sequencing</p>
<p>为整个外显子组做标准化和基因拷贝数变异的检测，和之前的两个工具相比有相同的功能。</p>
<p>relies on the availability of multiple samples processed(处理的) using the same sequencing pipeline for normalization, and does not require matched controls.依靠在通过使用相同的排序管道进行标准化处理许多样本的有效性，并且不需要相匹配的控制（控件）。</p>
<h4 id="使用"><a href="#使用" class="headerlink" title="使用"></a>使用</h4><p>依靠R语言的package</p>
<p>direct input:</p>
<ul>
<li>bamdir , which is a vector(载体，向量) indicating the directories of all .bam files;</li>
<li>sampname, which is a column(列) vector with row(行) entries of sample names;</li>
<li>bedFile, which indicates the directory of the .bed file (WES bait file, no header, sorted by start and end positions);</li>
<li>chr, which specifies the chromosome.</li>
</ul>
<h2 id="data-sharing"><a href="#data-sharing" class="headerlink" title="data sharing"></a>data sharing</h2><h3 id="DECIPHER"><a href="#DECIPHER" class="headerlink" title="DECIPHER"></a>DECIPHER</h3><p>DatabasE of genomiC varIation and Phenotype in Humans using Ensembl(全体) Resources</p>
<p>database for the interpretation of phenotype-linked plausibly pathogenic sequence and copy-number variation (一个为了解释表型连接的有可能是致病序列和基因拷贝数变异的数据库)</p>
<p>enhances clinical diagnosis(临床诊断) by retrieving(检索) information from a variety of bioinformatics(生物学信息) resources relevant to the variant found in the patient. The patient’s variant is displayed in the context of both normal variation and pathogenic variation reported at that locus thereby facilitating interpretation.</p>
<h4 id="download"><a href="#download" class="headerlink" title="download :"></a>download :</h4><p> data : All files are in GRCh37/hg19 human genome assembly(组装)</p>
<blockquote>
<p>GRC : Genome Reference Consortium（基因组参照序列联盟）</p>
<p>h37 : Human Genome Build 37,即人类基因组组装（Assembly）的版本37</p>
<table>
    <tr>
        <th>NCBI</th>
        <th>UCSC</th>
        <th>对应ENSEMBL数据库</th>
    </tr>
    <tr>
        <td>GRCh36</td>
        <td>hg18</td>
        <td>ENSEMBL release_52</td>
    </tr>
    <tr>
        <td>GRCh37</td>
        <td>hg19</td>
        <td>ENSEMBL release_59/61/64/68/69/75</td>
    </tr>
    <tr>
        <td>GRCh38</td>
        <td>hg38</td>
        <td>ENSEMBL release_76/77/78/80/81/82</td>
    </tr>
</table>

<p>可以看到ENSEMBL的版本特别复杂！！！很容易搞混！</p>
<p>但是UCSC的版本就简单了，就hg18,19,38, 常用的是hg19，</p>
<p>看起来NCBI也是很简单，就GRCh36,37,38。</p>
</blockquote>
<p>haploinsufficiency prediction  (.bed.gz) 单倍剂量不足（一个等位基因突变后,另一个等位基因能正常表达,但这只有正常水平50%的蛋白质不足以维持细胞正常的生理功能.）预测<br>population copy-number variation frequencies (.txt.gz) 种群？<br>development disorder genotype-phenotype database(.csv.gz)</p>
<h4 id="browser"><a href="#browser" class="headerlink" title="browser"></a>browser</h4><ul>
<li><p>genome browser : 可以通过position,gene name,chromosome band name 来展示一个可视化交互的页面，通过<a href="https://wtsi-web.github.io/Genoverse/" target="_blank" rel="noopener">genoverse</a>实现。</p>
<blockquote>
<p><strong>genoverse</strong> :</p>
<p>a portable(轻便的), customizable(可定制的), back-end independent JavaScript and HTML5 based genome browser which allows the user to explore data in a dynamic and interactive manner.</p>
<p>works with a variety of formats, such as XML, JSON, BED, VCF, GFF, GFF3 or delimited text files, and can be customized to parse(解析) and display any data source as required.</p>
</blockquote>
</li>
<li><p>phenotype browser ：统计带有各种异常表型（Phenotypic abnormality）的病人的数量并以弦图的方式展示出来 </p>
</li>
<li><p>genes</p>
</li>
<li><p>cnv syndromes</p>
</li>
<li><p>gene disorders</p>
</li>
</ul>
<h3 id="Cafe-variome"><a href="#Cafe-variome" class="headerlink" title="Cafe variome"></a>Cafe variome</h3><p>数据发现的工具(data discovery tool)，面向那些希望自己的数据能够被网络上的其他人或更广泛的科学社区发现的生物医学数据拥有者。</p>
<p>Cafe Variome software is provided as a compressed archive(档案文件) containing the entire Web application.</p>
<h3 id="GeneMatcher"><a href="#GeneMatcher" class="headerlink" title="GeneMatcher"></a>GeneMatcher</h3><p>Online platform designed to connect clinicians(临床医生) and researchers from around the world who share an interest in the same gene or genes.</p>
<p>The site allows individuals to post a gene (or genes) of interest and will connect individuals who post the same gene. Users create an account and submit gene(s) of interest (by gene symbol or base pair position). Users have the option, though are not required, to provide a variant (or variants) (by base pair position), diagnosis based upon OMIM® number, as well as to submit clinical features of the patient/family and add that to the matching criteria. The match is done automatically. When a match occurs, the submitters will automatically receive email notification. Follow-up(后续) is at the discretion(判断力，判定) of the submitters. It is also possible to query other Matchmakers (see <a href="https://www.matchmakerexchange.org/" target="_blank" rel="noopener">MatchmakerExchange.org</a>) to see if they contain matches. Upon entry to the site, the submitter will be prompted to(被提示) select the database(s) and matching criteria. If a match is not identified at the time of submission, the genes of interest will continue to be queried by new entries. Genes or gene lists may also be left on the site even after a match has been identified.(简单的说就是用户可以上传其所感兴趣的基因，然后网站会联系上传了同一基因的人，上传基因时有很多可选项，比如说变异或者疾病诊断等等。这个匹配过程是自动的，当匹配成功时上传人就会收到邮件。进入网站时，上传人会被提示选择数据库和匹配的条件。如果提交时没有找到匹配，感兴趣的基因会继续被之后新提交的条目查询。即使已经找到匹配，基因或基因列表仍然可以保留在网站上。)</p>
<p>Users may not access the full database, and may only search or view the data linked to their own account.（用户只能使用他们自己的数据，没有整个数据库的使用权，只可以搜索或者观察与他们账户相关的数据）。</p>
<h3 id="RD-connect"><a href="#RD-connect" class="headerlink" title="RD-connect"></a>RD-connect</h3><p>An Integrated(综合性) Platform Connecting Databases, Registries(登记注册), Biobanks(生物信息库) and Clinical Bioinformatics for Rare Disease Research（功能看起来和之前的类似，这类data sharing工具都是这样）</p>
<h3 id="PhenomeCentral"><a href="#PhenomeCentral" class="headerlink" title="PhenomeCentral"></a>PhenomeCentral</h3><p>PhenomeCentral identifies(鉴别) similar patients in the database based on semantic(语义的) similarity between clinical features, automatically prioritized(优先考虑) genes from whole‐exome data, and candidate genes entered(登记) by the users, enabling both hypothesis‐free and hypothesis‐driven(假设驱动) matchmaking.（可以基于临床表征的语义相似性在数据库中鉴别相类似的病人，优先考虑从整个外显子组数据中的基因）</p>
<h3 id="MatchMaker-Exchange"><a href="#MatchMaker-Exchange" class="headerlink" title="MatchMaker Exchange"></a>MatchMaker Exchange</h3><p>provide a robust(强健的) and systematic(系统的) approach to rare disease gene discovery through the creation of a federated(联合的) network connecting databases of genotypes and rare phenotypes using a common application programming interface (API) （为罕见疾病基因的发现提供强健的和系统的方法通过使用常见的API建立基因型和罕见表型的数据库的联合网络连接）</p>
<h2 id="phenotype"><a href="#phenotype" class="headerlink" title="phenotype"></a>phenotype</h2><h3 id="phenotips"><a href="#phenotips" class="headerlink" title="phenotips"></a>phenotips</h3><p>collecting and analyzing phenotypic(表型的) information of patients with genetic disorders(家族遗传性疾病).</p>
<p>大概就是这个软件可以将家传遗传性疾病图谱以图形的方式呈现出来。</p>
<h3 id="PhenoDB"><a href="#PhenoDB" class="headerlink" title="PhenoDB"></a>PhenoDB</h3><p>a robust, useful database for collection, storage, and analysis of phenotypic data</p>
<p>一个强健的有用的数据库用来收集、存储和分析表型数据</p>
<h3 id="Phenominer"><a href="#Phenominer" class="headerlink" title="Phenominer"></a>Phenominer</h3><p>Online database of phenotypes and associated disorders(主要就是一个建立了表型和与之相对的疾病的在线数据库，数据库可以下载。)</p>
<p>PhenoMiner is a research project aimed at the capture and encoding of phenotypes in the scientific literature(科学文献).The PhenoMiner Portal provides a way to search the vocabulary of terms we extracted(提取) from the scientific literature. The system that does this extraction is based on text/data-mining technology - natural language processing, machine learning and conceptual analysis.</p>
<h2 id="data-compression"><a href="#data-compression" class="headerlink" title="data compression"></a>data compression</h2><h3 id="sam-bam-cram"><a href="#sam-bam-cram" class="headerlink" title="sam,bam,cram"></a>sam,bam,cram</h3><h4 id="sam-bam-format"><a href="#sam-bam-format" class="headerlink" title="sam/bam format"></a>sam/bam format</h4><p><code>bam</code>文件和<code>sam</code>文件基本上是同样的内容，差别在于<code>sam</code>文件是人看得懂的，而<code>bam</code>文件是压缩成二进制的文件。SAM的档案架构分两部分，一部分是Header，另一部分是Alignment。外观如下图：</p>
<p><img src="https://weitinglindotcom.files.wordpress.com/2016/01/screenshot24.png?w=620" alt="representation:sam/bam format"></p>
<p>Header的部分位在整个档案的一开头，里面会有这份档案的基本资料，如有无sorted过、里头使用的reference及其长度、使用过什么工具处理和alignment时所下的指令。Header里头会有几个关键组成：@SQ / @RG / @PG，分别代表以下的意义</p>
<p>@SQ 这个开头含有做alignment所使用的reference序列资讯，SN代表的是sequence name，而紧跟着为LN，代表此参考序列的长度</p>
<p>@RG代表着read group的资讯和sample的基本资料，部分软体会根据里头的ID，去辨识有无batch effect，有的会有PL指标去代表其所使用的定序平台</p>
<p>@PG里头含有此次alignment所使用的程式资讯，CL指标后面有所下的指令，VN后面有使用的软体版本资讯</p>
<p>Alignment的部分就是实际一个个read的alignment资料，一行为一个reads，其用以下的资讯来说明每个reads的状况：</p>
<p><img src="https://weitinglindotcom.files.wordpress.com/2016/01/screenshot25.png?w=620" alt="alignment"></p>
<p>依序是每个Reads独一无二的编号，通常还有在定序机器上的位置，FLAG，则是代表此reads的mapping状态，chr则很明显的是对到的染色体，Start代表reads对应到reference上的开始位置，CIGAR则代表这个reads每一个位置跟reference mapping的状态，是否100% Match到，后面还有一些进阶资讯，假如是paired-read则会显示他的mate reads的位置。</p>
<h4 id="cram"><a href="#cram" class="headerlink" title="cram"></a>cram</h4><p>但是现在<code>bam</code>文件还是太大，所以再进行进一步压缩成<code>cram</code>文件，压缩的机制就是以reference上的相对位置和改变的资讯来存储（所以需要固定的reference版本），而并非是存储read序列。</p>
<p><img src="https://weitinglindotcom.files.wordpress.com/2016/01/screenshot21.png" alt="compress"></p>
<h3 id="tabix"><a href="#tabix" class="headerlink" title="tabix"></a>tabix</h3><p>只是一个用来为已排序文件添加索引和查询的工具，支持的已排序文件的格式有：<code>gff</code>,<code>bed</code>,<code>sam</code>,<code>vcf</code>，并且这些文件都需要用<code>bgzip</code>压缩成<code>.gz</code>的格式。会生成相应的索引文件，其格式为<code>.tab.bgz.tbi</code>或者<code>.tab.bgz.csi</code>。</p>
<p>samtools 主要用来处理<code>sam</code>,<code>bam</code>,<code>cram</code>格式的文件，排序、添加索引、观察之类的。</p>
<h3 id="gqt"><a href="#gqt" class="headerlink" title="gqt"></a>gqt</h3><p>主要支持<code>vcf</code>,<code>vcf.gz</code>,<code>bcf</code>文件，<code>bcf</code>是二进制版本的<code>vcf</code>文件，他们之间的关系与<code>bam</code>文件和<code>sam</code>文件类似，<code>bcf</code>文件存取的信息与<code>vcf</code>文件相同。</p>
<ul>
<li><p>convert    Convert between file types(可以用来添加gqt索引)</p>
</li>
<li><p>query      Query the index</p>
</li>
<li><p>pca-shared Compute the similarity matrix for PCA based on the number of shared non-reference loci.</p>
</li>
<li><p>calpha     Calculate C-alpha parameters (Neale 2011)</p>
</li>
<li><p>gst        Calculate Gst statistic (Nei 1973)</p>
</li>
<li><p>fst        Calculate Fst statistic (Weir and Cockerham 1984)</p>
</li>
</ul>
<p>bcftools 主要用于<code>vcf</code>,<code>bcf</code>文件，主要功能包括添加索引，对文件进行操作（注释、转换、观察、排序等等），或者进行分析（主要有snp/indel calling,过滤等等）。可以实现bcf与vcf文件之间的转换：</p>
<pre><code>bcftools view view.vcf.gz -O u -o view.bcf
-O参数指定输出文件的类型，b代表压缩后的BCF文件，u代表未经压缩的BCF文件，z代表压缩后的VCF文件，v代表未经压缩的VCF文件；-o参数指定输出文件的名字。
</code></pre><h2 id="cloud-tools"><a href="#cloud-tools" class="headerlink" title="cloud tools"></a>cloud tools</h2><h3 id="VAT"><a href="#VAT" class="headerlink" title="VAT"></a>VAT</h3><p>主要涉及到的文件格式有两个：vcf格式和interval格式。</p>
<p>The Interval format consists of eight tab-delimited columns and is used to represent genomic intervals such as genes. (包含有8个tab字符分隔的列，用来表示基因组间隔)</p>
<p>For the purpose of VAT, the name field in the Interval file must contain four pieces of information delimited by the ‘|’ symbol (geneId|transcriptId|geneName|transcriptName). Using the gencode2interval program ensures proper formatting.(interval文件的命名要求)</p>
<h4 id="core-modules"><a href="#core-modules" class="headerlink" title="core modules"></a>core modules</h4><ul>
<li>snpMapper : annotate a set of SNPs in VCF format. Takes a VCF input from STDIN(标准输入), outputs annotated SNPs in VCF format. The annotation information is captured as part of the INFO field(vcf格式下头部有INFO字段). </li>
<li>indelMapper ： annotate a set of indels in VCF format. Outputs annotated indels in VCF format. The annotation information is captured as part of the INFO field. </li>
<li>svMapper ：annotate a set of SVs in VCF format. Outputs annotated SVs in VCF format. The annotation information is captured as part of the INFO field. </li>
<li>genericMapper ：annotate a number of different variants in VCF format. Outputs the annotated variants in VCF format. The annotation information is captured as part of the INFO field.</li>
<li>vcfSummary ： aggregate(聚集、集合) annotated variants across genes and samples.Generates two output files. The first file, named <code>file.geneSummary.txt</code>, contains the number of variants categorized(分类) by type for each gene. A second file, named <code>file.sampleSummary.txt</code>, summarizes number of variants categorized by type for each sample.</li>
<li>vcfImages ：generate an image for each gene to visualize effect of the annotated variants.Generates an image in PNG format for each gene that has at least one annotated variant.</li>
<li>vcfSubsetByGene ：subset a VCF file with annotated variants by gene.Generates a VCF file for each gene that has at least one annotated variant.(subset能否理解为为每个基因划分为单独的vcf文件)</li>
<li>vcfModifyHeader ：modify(修改) the header line (part of the meta-lines) in a VCF file. Generates a VCF header file.</li>
</ul>
<p>以上这些命令都是通过CLI执行。前面几个添加注释的命令大都包含annotation.interval和annotation.fa做参数。</p>
<h4 id="web端"><a href="#web端" class="headerlink" title="web端"></a>web端</h4><p>web端做的就是一个将所有处理所得结果进行可视化的一个步骤，在本地进行文件的相应处理之后，就可以将相关文件打包上传到安装了vat的web browser页面之上，上传的打包压缩文件包括</p>
<ul>
<li>Directory with the images and the VCF files for each gene (ALL.2of4intersection.20100804.chr22)</li>
<li>File with the gene summary (ALL.2of4intersection.20100804.chr22.geneSummary.txt)</li>
<li>File with the sample summary (ALL.2of4intersection.20100804.chr22.sampleSummary.txt)</li>
<li>Compressed VCF file with the annotated variants (ALL.2of4intersection.20100804.chr22.vcf.gz)</li>
<li>Index file of the annotated variants (ALL.2of4intersection.20100804.chr22.vcf.gz.tbi)</li>
</ul>
<p>注释文件可以在exac上下载，ExAC.r0.3.sites.vep.vcf.gz就是典型的使用vep(基于Perl的对遗传变异信息提供更具体的注释)进行注释的。</p>
<h3 id="mercury"><a href="#mercury" class="headerlink" title="mercury"></a>mercury</h3><p>an automated approach that integrates(使…成为一体) multiple sequence analysis components across many computational steps, from obtaining patient samples to providing a fully annotated list of variant sites for clinical applications.(这个工具使从掌握病人样本到提供一整个添加了注释过的变异列表的整个过程的计算步骤成为一个整体)</p>
<h4 id="workflow"><a href="#workflow" class="headerlink" title="workflow"></a>workflow</h4><p>Source information includes sample and project management data and the characteristics of library preparation and sequencing. (原始输入信息包括样本和项目管理数据，以及文库制备和测序的特征)</p>
<ul>
<li>making reads and qualities</li>
<li>mapping</li>
<li>merging and bam finishing</li>
<li>variant calling</li>
<li>annotation and data delivery</li>
</ul>
<h4 id="cloud-workflow-and-local-workflow"><a href="#cloud-workflow-and-local-workflow" class="headerlink" title="cloud workflow and local workflow"></a>cloud workflow and local workflow</h4><p><em>Mercury</em> has been instantiated in the cloud via the DNAnexus platform (utilizing AWS’s EC2 and S3).</p>
<p>After sample data are uploaded to the DNAnexus environment, the workflow is instantiated in the cloud.</p>

      
    </div>
    
    
    

    

    

    

    <footer class="post-footer">
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2018/12/14/read-node.jsday1/" rel="next" title="【读书笔记】Node_Foundation">
                <i class="fa fa-chevron-left"></i> 【读书笔记】Node_Foundation
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2018/12/20/graduation-thesis-of-undergraduate/" rel="prev" title="related work for graduation thesis of undergraduate">
                related work for graduation thesis of undergraduate <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  
    <div class="comments" id="comments">
    </div>
  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
            
              <p class="site-author-name" itemprop="name">renhao</p>
              <p class="site-description motion-element" itemprop="description"></p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/archives/">
              
                  <span class="site-state-item-count">94</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-categories">
                <a href="/categories/index.html">
                  <span class="site-state-item-count">19</span>
                  <span class="site-state-item-name">分类</span>
                </a>
              </div>
            

            

          </nav>

          
            <div class="feed-link motion-element">
              <a href="/atom.xml" rel="alternate">
                <i class="fa fa-rss"></i>
                RSS
              </a>
            </div>
          

          

          
          

          
          

          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#IGV"><span class="nav-number">1.</span> <span class="nav-text">IGV</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#features"><span class="nav-number">1.1.</span> <span class="nav-text">features</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#支持的输入数据格式："><span class="nav-number">1.2.</span> <span class="nav-text">支持的输入数据格式：</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#igv对输入文件进行了什么处理（如何进行各种文件的可视化展示）"><span class="nav-number">1.3.</span> <span class="nav-text">igv对输入文件进行了什么处理（如何进行各种文件的可视化展示）?</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#igv如何进行大文件可视化的高性能？"><span class="nav-number">1.4.</span> <span class="nav-text">igv如何进行大文件可视化的高性能？</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#对于文件规模过大的问题："><span class="nav-number">1.4.1.</span> <span class="nav-text">对于文件规模过大的问题：</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#igv用到的工具（为了提升性能以及进行中间处理）、如何使用的？"><span class="nav-number">1.5.</span> <span class="nav-text">igv用到的工具（为了提升性能以及进行中间处理）、如何使用的？</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#igv怎么实现支持本地数据和远程数据，如果是浏览器怎么实现实时的可视化并且加载大文件的性能不至于太差影响用户体验（exac）？"><span class="nav-number">1.6.</span> <span class="nav-text">igv怎么实现支持本地数据和远程数据，如果是浏览器怎么实现实时的可视化并且加载大文件的性能不至于太差影响用户体验（exac）？</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#ExAC-Browser"><span class="nav-number">2.</span> <span class="nav-text">ExAC Browser</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#ExAC"><span class="nav-number">2.1.</span> <span class="nav-text">ExAC :</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#function"><span 
class="nav-number">2.1.1.</span> <span class="nav-text">function</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#ExAC-Browser-1"><span class="nav-number">2.2.</span> <span class="nav-text">ExAC Browser :</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#可视化的机制是什么？"><span class="nav-number">2.2.1.</span> <span class="nav-text">可视化的机制是什么？</span></a><ol class="nav-child"><li class="nav-item nav-level-5"><a class="nav-link" href="#IGV-js"><span class="nav-number">2.2.1.1.</span> <span class="nav-text">IGV.js :</span></a><ol class="nav-child"><li class="nav-item nav-level-6"><a class="nav-link" href="#Data-server-requirements"><span class="nav-number">2.2.1.1.1.</span> <span class="nav-text">Data server requirements</span></a></li></ol></li></ol></li><li class="nav-item nav-level-4"><a class="nav-link" href="#文件读取的方式是什么（或者说对文件进行了什么处理）？"><span class="nav-number">2.2.2.</span> <span class="nav-text">文件读取的方式是什么（或者说对文件进行了什么处理）？</span></a></li></ol></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#NGS流程"><span class="nav-number">3.</span> <span class="nav-text">NGS流程</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#vcftools"><span class="nav-number">4.</span> <span class="nav-text">vcftools</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#VCF"><span class="nav-number">4.1.</span> <span class="nav-text">VCF</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#VCFtools"><span class="nav-number">4.2.</span> <span class="nav-text">VCFtools</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#software-suite-软件集-：（split-into-two-modules"><span class="nav-number">4.2.1.</span> <span class="nav-text">software suite(软件集)：（split into two modules)</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#use-it"><span 
class="nav-number">4.2.2.</span> <span class="nav-text">use it</span></a></li></ol></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#consortium"><span class="nav-number">5.</span> <span class="nav-text">consortium</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#icgc"><span class="nav-number">5.1.</span> <span class="nav-text">icgc</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#impc"><span class="nav-number">5.2.</span> <span class="nav-text">impc</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#giab"><span class="nav-number">5.3.</span> <span class="nav-text">giab</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#ENCODE-Project-consortium"><span class="nav-number">5.4.</span> <span class="nav-text">ENCODE Project consortium</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#seven-bridges"><span class="nav-number">5.5.</span> <span class="nav-text">seven bridges</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#seven-bridges-genome-browser"><span class="nav-number">5.5.1.</span> <span class="nav-text">seven bridges genome browser</span></a></li></ol></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#CNV-detection"><span class="nav-number">6.</span> <span class="nav-text">CNV detection</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#cnvs"><span class="nav-number">6.1.</span> <span class="nav-text">cnvs :</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#CoNIFER"><span class="nav-number">6.2.</span> <span class="nav-text">CoNIFER</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#功能："><span class="nav-number">6.2.1.</span> <span class="nav-text">功能：</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" 
href="#使用："><span class="nav-number">6.2.2.</span> <span class="nav-text">使用：</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#XHMM"><span class="nav-number">6.3.</span> <span class="nav-text">XHMM</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#功能：-1"><span class="nav-number">6.3.1.</span> <span class="nav-text">功能：</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#使用：-1"><span class="nav-number">6.3.2.</span> <span class="nav-text">使用：</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Codex"><span class="nav-number">6.4.</span> <span class="nav-text">Codex</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#功能"><span class="nav-number">6.4.1.</span> <span class="nav-text">功能</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#使用"><span class="nav-number">6.4.2.</span> <span class="nav-text">使用</span></a></li></ol></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#data-sharing"><span class="nav-number">7.</span> <span class="nav-text">data sharing</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#DECIPHER"><span class="nav-number">7.1.</span> <span class="nav-text">DECIPHER</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#download"><span class="nav-number">7.1.1.</span> <span class="nav-text">download :</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#browser"><span class="nav-number">7.1.2.</span> <span class="nav-text">browser</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Cafe-variome"><span class="nav-number">7.2.</span> <span class="nav-text">Cafe variome</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#GeneMatcher"><span class="nav-number">7.3.</span> <span 
class="nav-text">GeneMatcher</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#RD-connect"><span class="nav-number">7.4.</span> <span class="nav-text">RD-connect</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#PhenomeCentral"><span class="nav-number">7.5.</span> <span class="nav-text">PhenomeCentral</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#MatchMaker-Exchange"><span class="nav-number">7.6.</span> <span class="nav-text">MatchMaker Exchange</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#phenotype"><span class="nav-number">8.</span> <span class="nav-text">phenotype</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#phenotips"><span class="nav-number">8.1.</span> <span class="nav-text">phenotips</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#PhenoDB"><span class="nav-number">8.2.</span> <span class="nav-text">PhenoDB</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Phenominer"><span class="nav-number">8.3.</span> <span class="nav-text">Phenominer</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#data-compression"><span class="nav-number">9.</span> <span class="nav-text">data compression</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#sam-bam-cram"><span class="nav-number">9.1.</span> <span class="nav-text">sam,bam,cram</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#sam-bam-format"><span class="nav-number">9.1.1.</span> <span class="nav-text">sam/bam format</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#cram"><span class="nav-number">9.1.2.</span> <span class="nav-text">cram</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#tabix"><span class="nav-number">9.2.</span> 
<span class="nav-text">tabix</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#gqt"><span class="nav-number">9.3.</span> <span class="nav-text">gqt</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#cloud-tools"><span class="nav-number">10.</span> <span class="nav-text">cloud tools</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#VAT"><span class="nav-number">10.1.</span> <span class="nav-text">VAT</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#core-modules"><span class="nav-number">10.1.1.</span> <span class="nav-text">core modules</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#web端"><span class="nav-number">10.1.2.</span> <span class="nav-text">web端</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#mercury"><span class="nav-number">10.2.</span> <span class="nav-text">mercury</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#workflow"><span class="nav-number">10.2.1.</span> <span class="nav-text">workflow</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#cloud-workflow-and-local-workflow"><span class="nav-number">10.2.2.</span> <span class="nav-text">cloud workflow and local workflow</span></a></li></ol></li></ol></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; <span itemprop="copyrightYear">2020</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">renhao</span>

  
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-area-chart"></i>
    </span>
    
      <span class="post-meta-item-text">Site words total count&#58;</span>
    
    <span title="Site words total count">265.9k</span>
  
</div>


  <!-- Generator credit. The link opens in a new tab, so rel="noopener" keeps the opened page from reaching back via window.opener. -->
  <div class="powered-by">由 <a class="theme-link" target="_blank" rel="noopener" href="https://hexo.io">Hexo</a> 强力驱动</div>



  <span class="post-meta-divider">|</span>



  <!-- Theme credit. The link opens in a new tab, so rel="noopener" keeps the opened page from reaching back via window.opener. -->
  <div class="theme-info">主题 &mdash; <a class="theme-link" target="_blank" rel="noopener" href="https://github.com/iissnan/hexo-theme-next">NexT.Muse</a> v5.1.4</div>




        







        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

  </div>

  

<script type="text/javascript">
  // Keep window.Promise only when its type tag is '[object Function]';
  // any other value (missing, or a non-function replacement) is reset to null.
  (function () {
    var promiseTag = Object.prototype.toString.call(window.Promise);
    if (promiseTag === '[object Function]') {
      return;
    }
    window.Promise = null;
  })();
</script>









  












  
  
    <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  


  


  <script type="text/javascript" src="/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/js/src/motion.js?v=5.1.4"></script>



  
  

  
  <script type="text/javascript" src="/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  










  <!-- Third-party scripts needed by the inline `new Valine(...)` call below.
       Loaded with an explicit https: scheme instead of protocol-relative URLs,
       so they are never fetched over plain HTTP. -->
  <script src="https://cdn1.lncld.net/static/js/3.0.4/av-min.js"></script>
  <script src="https://unpkg.com/valine/dist/Valine.min.js"></script>
  
  <script type="text/javascript">
    // Whitelist of guest field names; entries of `guest` not in this list are dropped.
    var GUEST = ['nick','mail','link'];
    // Comma-separated field list, narrowed to the whitelisted names (ends up as an array).
    var guest = 'nick,mail,link';
    guest = guest.split(',').filter(item=>{
      return GUEST.indexOf(item)>-1;
    });
    // Mount the Valine comment widget on the #comments container.
    new Valine({
        el: '#comments' ,
        verify: false,
        notify: false,
        appId: '2P1xqUWiCoNm14MH4yhURlJi-gzGzoHsz',
        appKey: 'uJN0uagHIVgXtBO1OuLV9Ban',
        placeholder: 'Just go go',
        avatar:'mm',
        // `guest_info` is the older option name; kept for releases that still read it.
        guest_info:guest,
        // NOTE(review): newer Valine releases document `meta` for the same list — confirm
        // against the version unpkg currently serves.
        meta: guest,
        // Was `'10' || 10`, which always evaluated to the string '10'; pass a number.
        pageSize: 10,
    });
  </script>



  





  

  

  

  
  

  

  

  

</body>
</html>
